The goal of this notebook is to compare pseudobulk and bulk
calculations to determine which pseudobulk calculation should we proceed
with for modeling: the sum of logcounts
(pseudobulk_logcounts) or the
DESeq2-normalized sum of raw counts
(pseudobulk_deseq). To this end, we’ll visualize expression
distributions, both on their own and compared to bulk TPM.
renv::load()
library(ggplot2)
theme_set(theme_bw())
data_dir <- here::here("analysis", "pseudobulk-bulk-prediction", "data")
tpm_dir <- file.path(data_dir, "tpm")
pseudobulk_dir <- file.path(data_dir, "pseudobulk")
tpm_files <- list.files(
path = tpm_dir,
full.names = TRUE,
pattern = "*-tpm.tsv$"
)
tpm_names <- stringr::str_split_i(basename(tpm_files), pattern = "-", i = 1)
names(tpm_files) <- tpm_names
pseudobulk_files <- list.files(
path = pseudobulk_dir,
full.names = TRUE,
pattern = "*-pseudobulk.rds$"
)
pseudobulk_names <- stringr::str_split_i(basename(pseudobulk_files), pattern = "-", i = 1)
names(pseudobulk_files) <- pseudobulk_names
# Make sure we have the same projects, in the same order
stopifnot(
all.equal(names(tpm_files), names(pseudobulk_files))
)
Data are saved as matrices, so we’ll convert them to per-project long data frames here.
tpm_df_list <- tpm_files |>
purrr::map(
\(tpm_file) {
readr::read_tsv(tpm_file, show_col_types = FALSE) |>
tidyr::pivot_longer(
-ensembl_id,
names_to = "sample_id",
values_to = "expression"
) |>
dplyr::mutate(expression_type = "bulk_tpm")
}
)
project_df_list <- purrr::map2(
pseudobulk_files,
tpm_df_list,
\(pseudo_file, tpm_df) {
pseudo_list <- readr::read_rds(pseudo_file)
# Filter tpm_df for genes in the pseudobulk
tpm_filtered_df <- tpm_df |>
dplyr::filter(ensembl_id %in% rownames(pseudo_list[[1]]))
# combine pseudobulk matrices into a long data frame and bind with tpm
combined_df_long <- pseudo_list |>
purrr::map(
\(mat) {
mat |>
as.data.frame() |>
tibble::rownames_to_column("ensembl_id") |>
tidyr::pivot_longer(
-ensembl_id,
names_to = "sample_id",
values_to = "expression"
)
}
) |>
purrr::list_rbind(names_to = "expression_type") |>
dplyr::bind_rows(tpm_filtered_df) |>
dplyr::select(sample_id, ensembl_id, expression, expression_type)
}
)
shared_genes <- purrr::map(
project_df_list,
\(df) df$ensembl_id
)
tpm_df_indicator_list <- purrr::map2(
tpm_df_list,
shared_genes,
\(df, ids) dplyr::mutate(df, in_pseudo = ensembl_id %in% ids)
)
# clean up!
rm(shared_genes)
rm(tpm_df_list)
When establishing pseudobulk datasets, genes with low expression levels were removed.
Before looking at pseudobulk measures here, we’ll look at how the TPM distributions (log2) compared between for genes that are and are not included in the pseudobulk datasets. the comparisons. We expect to see that genes not included in the pseudobulk have lower TPM.
tpm_df_indicator_list |>
purrr::imap(
\(df, project_id) {
ggplot(df) +
aes(x = log2(expression), fill = in_pseudo) +
geom_density(alpha = 0.5) +
labs(
title = project_id,
x = "log2 bulk TPM"
) +
theme(legend.position = "bottom")
}
) |>
patchwork::wrap_plots(guides = "collect", nrow = 1) & theme(legend.position = "bottom")
We mostly see the expected trend: Genes included in the pseudobulk
counts have higher expression in bulk compared to genes which were
removed. SCPCP000009 is an exception here, but this project
has fewer samples compared to the rest so this could be a power
artifact:
tpm_df_indicator_list |>
purrr::map(
\(df) length(unique(df$sample_id))
)
$SCPCP000001
[1] 18
$SCPCP000002
[1] 20
$SCPCP000006
[1] 39
$SCPCP000009
[1] 3
$SCPCP000017
[1] 24
Pause to clean up a little!
rm(tpm_df_indicator_list)
First, let’s get a general sense of the scale of values in the pseudobulk quantities:
## logcounts
project_df_list |>
purrr::map(
\(df) {
df |>
dplyr::filter(expression_type == "pseudobulk_logcounts") |>
dplyr::pull(expression) |>
summary()
})
$SCPCP000001
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.000 3.588 29.944 219.697 194.455 23756.843
$SCPCP000002
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00 4.82 37.60 286.34 225.83 47645.99
$SCPCP000006
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00 3.47 23.37 250.10 147.66 138125.78
$SCPCP000009
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00 12.47 53.09 376.99 219.65 194487.03
$SCPCP000017
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00 3.31 32.08 343.49 290.80 99994.82
## deseq
project_df_list |>
purrr::map(
\(df) {
df |>
dplyr::filter(expression_type == "pseudobulk_deseq") |>
dplyr::pull(expression) |>
summary()
})
$SCPCP000001
Min. 1st Qu. Median Mean 3rd Qu. Max.
-2.508 2.354 5.394 5.188 7.881 20.706
$SCPCP000002
Min. 1st Qu. Median Mean 3rd Qu. Max.
-1.722 2.784 5.577 5.458 7.942 19.239
$SCPCP000006
Min. 1st Qu. Median Mean 3rd Qu. Max.
-2.510 1.986 4.661 4.659 7.182 18.286
$SCPCP000009
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.1023 3.9929 6.1419 6.1122 8.0571 19.7752
$SCPCP000017
Min. 1st Qu. Median Mean 3rd Qu. Max.
-2.330 2.093 5.243 5.226 8.288 22.893
The logcounts version has some strong right-skew, so we should log those values to support modeling. We don’t see the same level of skew from DESeq, which is good since we don’t expect to see it out of that normalization procedure; we don’t need to transform those.
Let’s therefore update expression to be log2 for both TPM and pseudobulk logcounts (i.e., for all but pseudobulk deseq) since we’ll use those scales moving forward:
project_df_list <- project_df_list |>
purrr::map(
\(df) {
df |>
dplyr::mutate(expression = ifelse(
stringr::str_detect(expression_type, "deseq"),
expression,
log2(expression) # TODO: do we want a fudge here?
))
}
)
Now, we can visualize distributions of all quantities:
project_df_list |>
purrr::imap(
\(df, project_id) {
ggplot(df) +
aes(x = expression, fill = expression_type) +
geom_density(alpha = 0.5) +
scale_fill_brewer(palette = "Dark2") +
facet_wrap(vars(expression_type), scales = "free", nrow = 3) +
ggtitle(project_id) +
theme(legend.position = "none")
}
) |>
patchwork::wrap_plots(guides = "collect", nrow = 1)
Warning: Removed 140153 rows containing non-finite outside the scale range
(`stat_density()`).
Warning: Removed 131868 rows containing non-finite outside the scale range
(`stat_density()`).
Warning: Removed 416112 rows containing non-finite outside the scale range
(`stat_density()`).
Warning: Removed 17186 rows containing non-finite outside the scale range
(`stat_density()`).
Warning: Removed 487720 rows containing non-finite outside the scale range
(`stat_density()`).
Both the pseudobulk distributions look reasonable enough!
Now come the many plots: What does the relationship look like between bulk and each flavor of pseudobulk? We’ll visualize this per sample.
We’ll first make a pivoted version of this data to enable plotting and modeling.
project_df_wide_list <- project_df_list |>
purrr::imap(
\(df, project_id) {
df |>
tidyr::pivot_wider(
names_from = expression_type,
values_from = expression
)
})
Next, we have many plots, organized by project:
bulk tpm ~ deseq pseudocountsbulk tpm ~ logcounts pseudocountsdeseq pseudocounts ~ logcounts pseudocounts, to assess
their similarity directlyproject_df_wide_list |>
purrr::imap(
\(df, project_id) {
p1 <- ggplot(df) +
aes(x = pseudobulk_deseq, y = bulk_tpm) +
geom_point(alpha = 0.2, size = 0.5) +
geom_smooth(method = "lm") +
facet_wrap(vars(sample_id), nrow = 5) +
ggtitle("bulk tpm ~ deseq")
p2 <- ggplot(df) +
aes(x = pseudobulk_logcounts, y = bulk_tpm) +
geom_point(alpha = 0.2, size = 0.5) +
geom_smooth(method = "lm") +
facet_wrap(vars(sample_id), nrow = 5) +
ggtitle("bulk tpm ~ logcounts")
p3 <- ggplot(df) +
aes(x = pseudobulk_logcounts, y = pseudobulk_deseq) +
geom_point(alpha = 0.2, size = 0.5) +
geom_smooth(method = "lm") +
facet_wrap(vars(sample_id), nrow = 5) +
ggtitle("deseq ~ logcounts")
patchwork::wrap_plots(p1, p2, p3)
}
) |>
patchwork::wrap_plots(nrow = 5)
On the whole, the relationship between bulk and pseudobulk look
exceptionally similar regardless of which pseudobulk quantity is used!
On the right-side, we see that indeed the pseudobulk versions are highly
similar to one another (right-side panels). The main difference we can
pick out from these plots is that the logcounts version has
a band of lowly-expressed genes which the deseq version
doesn’t appear to show, which again is consistent with their different
origins.
To identify if there’s a meaningful benefit to using one pseudobulk
version over another, we’ll build linear models of
tpm ~ pseudobulk * sample; we can assume an interaction
based on the slopes in the above scatterplots. Note that these are not
final models for analysis, but only used to guide our next steps of
actually building/fine-tuning the models.
We’ll also have a look at the distributions of model residuals along the way.
project_df_wide_list |>
purrr::map(
\(df) {
# Build models
# We first need to remove any undefined values for each model
df_deseq <- df |>
dplyr::filter(is.finite(pseudobulk_deseq), is.finite(bulk_tpm))
fit_deseq <- lm(bulk_tpm ~ pseudobulk_deseq * sample_id, data = df_deseq)
df_logcounts <- df |>
dplyr::filter(is.finite(pseudobulk_logcounts), is.finite(bulk_tpm))
fit_logcounts <- lm(bulk_tpm ~ pseudobulk_logcounts * sample_id, data = df_logcounts)
# Tabulate some fit stats
fit_table <- tibble::tribble(
~expression_type, ~rsquared, ~residual_sd,
"deseq", broom::glance(fit_deseq)$r.squared, broom::glance(fit_deseq)$sigma,
"logcounts", broom::glance(fit_logcounts)$r.squared, broom::glance(fit_logcounts)$sigma
)
# Tabulate some fit stats, wide for easier viewing
broom_deseq <- broom::glance(fit_deseq)
broom_logcounts <- broom::glance(fit_logcounts)
fit_table <- data.frame(
deseq_rsquared = broom_deseq$r.squared,
logcounts_rsquared = broom_logcounts$r.squared,
deseq_residual_sd = broom_deseq$sigma,
logcounts_residual_sd = broom_logcounts$sigma
)
# Plot the residuals as well on the way down
deseq_augment <- broom::augment(fit_deseq)
logcounts_augment <- broom::augment(fit_logcounts)
p1 <- ggplot(deseq_augment) +
aes(x = pseudobulk_deseq, y = .resid) +
geom_point(size = 0.5, alpha = 0.5) +
geom_hline(yintercept = 0, color = "red") +
ggtitle("deseq")
p2 <- ggplot(logcounts_augment) +
aes(x = pseudobulk_logcounts, y = .resid) +
geom_point(size = 0.5, alpha = 0.5) +
geom_hline(yintercept = 0, color = "red") +
ggtitle("logcounts")
print(patchwork::wrap_plots(p1, p2, nrow = 1))
# actually return the fit info
fit_table
}
) |>
purrr::list_rbind(names_to = "project_id")
Models from different pseudobulk quantities are expectedly, based on
scatterplots above, minimal, and based on the residual plots around
linear model assumptions seem reasonably met. The DESeq2
flavor appears to marginally outperform (based on these limited stats)
in 3/5 projects.
Either calculation for pseudobulk appears to be fine, and it’s likely
that we’d get roughly the same results either way. I would suggest to
proceed with the DESeq2 normalized version since we do not
need additional log2 transformations, which leads to loss
of 0-expression genes in the model since they are undefined (unless we
want to fudge factor them, but avoiding this if we can seems best).
sessionInfo()
R version 4.4.0 (2024-04-24)
Platform: aarch64-apple-darwin20
Running under: macOS 15.2
Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.0
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
time zone: America/New_York
tzcode source: internal
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] ggplot2_3.5.1
loaded via a namespace (and not attached):
[1] sass_0.4.9 generics_0.1.3 tidyr_1.3.1
[4] renv_1.0.11 stringi_1.8.4 lattice_0.22-6
[7] hms_1.1.3 digest_0.6.37 magrittr_2.0.3
[10] evaluate_1.0.1 grid_4.4.0 RColorBrewer_1.1-3
[13] fastmap_1.2.0 Matrix_1.7-1 rprojroot_2.0.4
[16] jsonlite_1.8.9 backports_1.5.0 BiocManager_1.30.25
[19] mgcv_1.9-1 purrr_1.0.2 scales_1.3.0
[22] jquerylib_0.1.4 cli_3.6.3 rlang_1.1.4
[25] crayon_1.5.3 bit64_4.5.2 munsell_0.5.1
[28] splines_4.4.0 withr_3.0.2 cachem_1.1.0
[31] yaml_2.3.10 tools_4.4.0 parallel_4.4.0
[34] tzdb_0.4.0 dplyr_1.1.4 colorspace_2.1-1
[37] here_1.0.1 broom_1.0.7 vctrs_0.6.5
[40] R6_2.5.1 lifecycle_1.0.4 stringr_1.5.1
[43] bit_4.5.0.1 vroom_1.6.5 pkgconfig_2.0.3
[46] pillar_1.10.0 bslib_0.8.0 gtable_0.3.6
[49] glue_1.8.0 xfun_0.49 tibble_3.2.1
[52] tidyselect_1.2.1 knitr_1.49 farver_2.1.2
[55] htmltools_0.5.8.1 nlme_3.1-166 patchwork_1.3.0
[58] rmarkdown_2.29 labeling_0.4.3 readr_2.1.5
[61] compiler_4.4.0